import traceback

import PyPDF2
from util.common import remove_garbled_characters


async def extract_text_from_pdf(pdf_path) -> str:
    """Extract the text of every page of a PDF and drop blank lines.

    Each page's extracted text is passed through the awaited
    ``remove_garbled_characters`` helper, the per-page results are
    concatenated, and every line of the result is stripped of surrounding
    whitespace, with lines that end up empty removed.

    Args:
        pdf_path: Path of the PDF file to read; it is opened in binary mode.

    Returns:
        The cleaned text with lines joined by newlines, or an empty string
        if the file does not exist.
    """
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Collect the cleaned pages and join once instead of growing a
            # string with repeated "+=".
            cleaned_pages = [
                await remove_garbled_characters(page.extract_text())
                for page in pdf_reader.pages
            ]
            text = "".join(cleaned_pages)
            # Simple normalization: strip each line and drop the empty ones.
            formatted_text = "\n".join(
                stripped
                for stripped in (line.strip() for line in text.splitlines())
                if stripped
            )
            return formatted_text
    except FileNotFoundError as e:
        # print_exc() reports the traceback of the exception being handled;
        # print_stack() would only show the current call stack, and wrapping
        # it in print() would additionally emit a stray "None".
        traceback.print_exc()
        print(f"错误: 发生未知错误 - {e}")
    return ""


if __name__ == "__main__":
    # Imported here because asyncio is only needed when run as a script.
    import asyncio

    pdf_path = '../data/pdf/《中国联通数据安全日志审计规范》.pdf'
    # extract_text_from_pdf is a coroutine function: calling it only creates
    # a coroutine object (always truthy), so it must be run on an event loop
    # to obtain the extracted text.
    extracted_text = asyncio.run(extract_text_from_pdf(pdf_path))
    if extracted_text:
        print(extracted_text)
